Predict sales prices and practice feature engineering, random forests (RFs), and gradient boosting
# Project setup.
# NOTE: the former `rm(list=ls())` was removed on purpose. It deletes every
# object in the caller's workspace when this script is sourced, yet it does not
# reset loaded packages, options, or graphical parameters, so it never gave a
# truly clean session. Restart R (or knit the document) for a fresh state.

# Root directory of the project; data paths are built relative to it.
PROJ_PATH <- '~/Documents/kaggle/house_prices'

# Restore the objects saved in the project's .Rdata file into the workspace.
# Presumably this provides `train` (and possibly other objects used below) --
# confirm against the script that wrote the file.
load(file.path(PROJ_PATH, 'data/house_prices.Rdata'))
# Plotting functions
# hist2: hist() with this report's defaults.
#   ...    : data and any further arguments, forwarded to hist() untouched
#   breaks : passed to hist() as `breaks` (default 30)
#   col    : used for both the bar fill and the bar border
#   xlab   : passed to hist() as `xlab` (default NULL)
# Returns whatever hist() returns.
hist2 <- function(..., breaks=30, col='darkgray', xlab=NULL){
  hist(
    ...,
    xlab=xlab,
    breaks=breaks,
    border=col,  # outline drawn in the fill colour
    col=col
  )
}
# barplot2: barplot() with this report's defaults.
#   ...   : heights and any further arguments, forwarded to barplot() untouched
#   col   : used for both the bar fill and the bar border
#   xlab  : x-axis label; now forwarded to barplot() (it used to be accepted
#           and then silently discarded)
#   horiz : bar orientation, horizontal by default as before; exposed so that
#           callers can request vertical bars without a duplicate-argument error
# Returns whatever barplot() returns.
barplot2 <- function(..., col='darkgray', xlab=NULL, horiz=TRUE){
  barplot(..., col=col, border=col, xlab=xlab, horiz=horiz)
}
# plot2: plot() with this report's defaults.
#   ...  : data and any further arguments, forwarded to plot() untouched
#   col  : drawing colour; defaults to 'gray30' at alpha 0.2
#   bty  : box type passed to plot(); defaults to 'n'
# Returns whatever plot() returns.
plot2 <- function(..., col=adjustcolor('gray30', alpha.f=0.2), bty='n'){
  plot(
    ...,
    bty=bty,
    col=col
  )
}
Data description
# Number of rows and columns in the training data
dim(train)
## [1] 1460 81
# Column names of the training data
names(train)
## [1] "Id" "MSSubClass" "MSZoning" "LotFrontage"
## [5] "LotArea" "Street" "Alley" "LotShape"
## [9] "LandContour" "Utilities" "LotConfig" "LandSlope"
## [13] "Neighborhood" "Condition1" "Condition2" "BldgType"
## [17] "HouseStyle" "OverallQual" "OverallCond" "YearBuilt"
## [21] "YearRemodAdd" "RoofStyle" "RoofMatl" "Exterior1st"
## [25] "Exterior2nd" "MasVnrType" "MasVnrArea" "ExterQual"
## [29] "ExterCond" "Foundation" "BsmtQual" "BsmtCond"
## [33] "BsmtExposure" "BsmtFinType1" "BsmtFinSF1" "BsmtFinType2"
## [37] "BsmtFinSF2" "BsmtUnfSF" "TotalBsmtSF" "Heating"
## [41] "HeatingQC" "CentralAir" "Electrical" "X1stFlrSF"
## [45] "X2ndFlrSF" "LowQualFinSF" "GrLivArea" "BsmtFullBath"
## [49] "BsmtHalfBath" "FullBath" "HalfBath" "BedroomAbvGr"
## [53] "KitchenAbvGr" "KitchenQual" "TotRmsAbvGrd" "Functional"
## [57] "Fireplaces" "FireplaceQu" "GarageType" "GarageYrBlt"
## [61] "GarageFinish" "GarageCars" "GarageArea" "GarageQual"
## [65] "GarageCond" "PavedDrive" "WoodDeckSF" "OpenPorchSF"
## [69] "EnclosedPorch" "X3SsnPorch" "ScreenPorch" "PoolArea"
## [73] "PoolQC" "Fence" "MiscFeature" "MiscVal"
## [77] "MoSold" "YrSold" "SaleType" "SaleCondition"
## [81] "SalePrice"
Histograms of numeric variables
# Histograms laid out four panels per row with equal 3-line margins.
# `num_idx` and `nm` are defined outside this section; presumably the indices
# of the numeric columns and the column names of `train` -- confirm.
par(mar=c(3,3,3,3), mfrow=c(1, 4))
for (col_idx in num_idx) {
  hist2(train[[col_idx]], main=nm[col_idx])
}
# The sale price is plotted separately, after the loop.
hist2(train$SalePrice, main='SalePrice')
Frequencies of categorical variables
# Bar chart of value counts for each column in `cat_idx`, four panels per row.
# las=2 sets axis labels perpendicular to the axis; the left margin is widened
# to 4 lines.
# `cat_idx` and `nm` are defined outside this section; presumably the indices
# of the categorical columns and the column names of `train` -- confirm.
par(las=2, mar=c(3,4,3,3), mfrow=c(1, 4))
for (col_idx in cat_idx) {
  barplot2(table(train[[col_idx]]), main=nm[col_idx])
}
Missingness of variables (only those with any values missing)
# Share of missing values per column, sorted ascending.
# Despite the name, the values are fractions in [0, 1], not percentages.
# vapply() pins the result to one numeric per column, so the outcome is always
# a numeric vector (sapply() would hand sort() a list for a zero-column input).
missing_perc <- sort(vapply(train,
                            function(x) sum(is.na(x)) / length(x),
                            numeric(1)))
# Perpendicular axis labels and a wide left margin for the column names.
par(las=2)
par(mar=c(3,7,2,2))
# Plot only the columns that have at least one missing value.
barplot2(missing_perc[missing_perc>0],
         cex.names=0.6, cex.axis=0.6, xlim=c(0,1),
         main='Missingness')
Relation of each variable to price
# One panel per column in `num_idx`, plotted against SalePrice, four per row.
# A two-column data frame is handed to plot2(), so the axis labels come from
# the column names. Every panel shares the full SalePrice range on the y axis.
par(mar=c(3,3,3,3), mfrow=c(1, 4))
for (col_idx in num_idx) {
  plot2(train[, c(nm[col_idx], 'SalePrice')],
        ylim=range(train$SalePrice), main=nm[col_idx])
}
# One panel per column in `cat_idx`: the column, converted to a factor, on the
# x axis against SalePrice, four panels per row, all sharing the full
# SalePrice range. With a factor x and numeric y, plot() is expected to draw
# one box per level -- assumes SalePrice is numeric.
par(mar=c(3,3,3,3), mfrow=c(1, 4))
for (col_idx in cat_idx) {
  plot2(factor(train[[col_idx]]), train$SalePrice,
        frame=FALSE,
        ylim=range(train$SalePrice), main=nm[col_idx])
}
Top correlated numeric variables (pairwise-complete observations)
# Correlation matrix of the `num_idx` columns; each entry uses only the rows
# where both columns are present.
cor_mat <- cor(train[, num_idx], use='pairwise.complete')
# Blank the diagonal and the lower triangle so every pair is listed once.
cor_mat[lower.tri(cor_mat, diag=TRUE)] <- NA

# For each pair of columns: the fraction of rows where both are non-missing.
not_na_mat <- !is.na(as.matrix(train[, num_idx]))
pairwise_completeness <- (t(not_na_mat) %*% not_na_mat) / nrow(train)

# Flatten both matrices column by column into one row per pair:
# x1 is the name of the matrix column, x2 the name of the matrix row.
cor_df <- data.frame(
  x1=colnames(cor_mat)[as.vector(col(cor_mat))],
  x2=colnames(cor_mat)[as.vector(row(cor_mat))],
  cor=as.vector(cor_mat),
  perc_complete=as.vector(pairwise_completeness)
)

# Drop the blanked (NA) entries and sort by absolute correlation, largest
# first, in a single step; na.last=NA removes the NA rows.
cor_df <- cor_df[order(abs(cor_df$cor), decreasing=TRUE, na.last=NA), ]

# Show only the pairs whose absolute correlation exceeds 0.5.
knitr::kable(cor_df[abs(cor_df$cor) > 0.50, ], row.names=FALSE)
| x1 | x2 | cor | perc_complete |
|---|---|---|---|
| GarageArea | GarageCars | 0.8824754 | 1.0000000 |
| GarageYrBlt | YearBuilt | 0.8256675 | 0.9445205 |
| TotRmsAbvGrd | GrLivArea | 0.8254894 | 1.0000000 |
| X1stFlrSF | TotalBsmtSF | 0.8195300 | 1.0000000 |
| GrLivArea | X2ndFlrSF | 0.6875011 | 1.0000000 |
| TotRmsAbvGrd | BedroomAbvGr | 0.6766199 | 1.0000000 |
| BsmtFullBath | BsmtFinSF1 | 0.6492118 | 1.0000000 |
| GarageYrBlt | YearRemodAdd | 0.6422768 | 0.9445205 |
| FullBath | GrLivArea | 0.6300116 | 1.0000000 |
| TotRmsAbvGrd | X2ndFlrSF | 0.6164226 | 1.0000000 |
| HalfBath | X2ndFlrSF | 0.6097073 | 1.0000000 |
| GarageCars | OverallQual | 0.6006707 | 1.0000000 |
| GrLivArea | OverallQual | 0.5930074 | 1.0000000 |
| YearRemodAdd | YearBuilt | 0.5928550 | 1.0000000 |
| GarageCars | GarageYrBlt | 0.5889200 | 0.9445205 |
| YearBuilt | OverallQual | 0.5723228 | 1.0000000 |
| GrLivArea | X1stFlrSF | 0.5660240 | 1.0000000 |
| GarageArea | GarageYrBlt | 0.5645671 | 0.9445205 |
| GarageArea | OverallQual | 0.5620218 | 1.0000000 |
| TotRmsAbvGrd | FullBath | 0.5547843 | 1.0000000 |
| YearRemodAdd | OverallQual | 0.5506839 | 1.0000000 |
| FullBath | OverallQual | 0.5505997 | 1.0000000 |
| GarageYrBlt | OverallQual | 0.5477658 | 0.9445205 |
| GarageCars | YearBuilt | 0.5378501 | 1.0000000 |
| TotalBsmtSF | OverallQual | 0.5378085 | 1.0000000 |
| TotalBsmtSF | BsmtFinSF1 | 0.5223961 | 1.0000000 |
| BedroomAbvGr | GrLivArea | 0.5212695 | 1.0000000 |
| BedroomAbvGr | X2ndFlrSF | 0.5029006 | 1.0000000 |